The Virginia Department of Transportation maintains a geospatial dataset of reported traffic accidents with the detailed version hosted here and a pared down basic version available here. The former has 85 attributes while the latter contains “only” 66 attributes.
We can pull in the data for Arlington County via the API as shown below:

library(tidyverse)
library(sf)
library(httr)
library(lubridate)
library(ggthemes)
library(janitor)
raw_data_0 <- st_read("https://services.arcgis.com/p5v98VHDX9Atv3l7/arcgis/rest/services/CrashData_test/FeatureServer/0/query?where=CRASH_YEAR%20%3D%20%272021%27&outFields=*&outSR=4326&f=geojson")
## Reading layer `OGRGeoJSON' from data source `https://services.arcgis.com/p5v98VHDX9Atv3l7/arcgis/rest/services/CrashData_test/FeatureServer/0/query?where=CRASH_YEAR%20%3D%20%272021%27&outFields=*&outSR=4326&f=geojson' using driver `GeoJSON'
## Simple feature collection with 97051 features and 67 fields
## geometry type:  POINT
## dimension:      XY
## bbox:           xmin: -83.62613 ymin: 36.54148 xmax: -75.35834 ymax: 39.43103
## geographic CRS: WGS 84
arlington <- raw_data_0 %>%
  filter(PHYSICAL_JURIS == "000. Arlington County")

dim(arlington)
## [1] 1447   68

One thing to know about these data is that they are stored using ESRI’s ArcGIS Hub and I am accessing them through the ESRI REST API, which is great. Howeever, the date fields require some manipulation once the data have been downloaded and as shown in the image below, the CRASH_DT attribute contains the date but not the time of the crash, which is stored in the CRASH_MILITARY_TM attribute:



The code below created two new fields that contain properly formatted versions of the crash date and crash time with the latter stored as an integer rather than a date.

arlington <- arlington %>%
  mutate(CR_DT_LONG = as.POSIXct((as.numeric(CRASH_DT) / 1000), origin = "1970-01-01", tz = "America/New_York"),
         CR_TIME = as.integer(CRASH_MILITARY_TM))
dim(arlington)
## [1] 1447   70
arlington <- arlington %>%
  mutate(CR_DATE = as.Date(format(CR_DT_LONG, "%Y-%m-%d")))

arlington_july_sept <- arlington %>% 
  filter(CR_DATE >= as.Date("07/01/2021", format="%m/%d/%Y") & CR_DATE <= as.Date("09/30/2021", format="%m/%d/%Y"))
dim(arlington_july_sept)
## [1] 493  71

There is a lot going on above. The dates are automatically converted to seconds since the origin date of January 1, 1970 when we pull them from the API and we divide by 1000 to scale the values properly (milliseconds). Time zones are listed here OlsonNames(tzdir = NULL) and you can take a deeper dive here or by taking a look at this page.


Let’s download the Arlington County boundary from the open data portal and create a kernel density representation of crash intensity based on the VDOT data.


ac_bound <- st_read("https://opendata.arcgis.com/datasets/562081e923254d408f30bbf9fff00f8b_0.geojson")
## Reading layer `Arlington_County_Boundary_Polygon' from data source `https://opendata.arcgis.com/datasets/562081e923254d408f30bbf9fff00f8b_0.geojson' using driver `GeoJSON'
## Simple feature collection with 1 feature and 3 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: -77.17232 ymin: 38.82748 xmax: -77.03218 ymax: 38.93431
## geographic CRS: WGS 84
ac_bound_proj <- st_transform(ac_bound, st_crs(arlington_july_sept))


arlington_july_sept %>%
  st_coordinates() %>%
  as_tibble() %>%
  ggplot() +
  stat_density_2d(aes(X, Y, fill = stat(level)),  geom = "polygon", alpha = 0.2,
                  h = 0.01, n = 300) + 
  coord_equal() + 
  geom_sf(data = arlington_july_sept, colour = "red", fill = NA,
          inherit.aes = FALSE) +
  xlab('Longitude') + 
  ylab('Latitude') +
  scale_fill_viridis_c() + 
  geom_sf(data = ac_bound_proj, colour = "grey70", fill = NA, inherit.aes = FALSE) +
  theme_map() + theme(legend.position = "none")

library(tmap)
library(tmaptools)

st_kde <- function(points, cellsize, bandwith, extent = NULL){
  require(MASS)
  require(raster)
  require(sf)
  if(is.null(extent)){
    extent_vec <- st_bbox(points)[c(1,3,2,4)]
  } else{
    extent_vec <- st_bbox(extent)[c(1,3,2,4)]
  }
  
  n_y <- ceiling((extent_vec[4]-extent_vec[3])/cellsize)
  n_x <- ceiling((extent_vec[2]-extent_vec[1])/cellsize)
  
  extent_vec[2] <- extent_vec[1]+(n_x*cellsize)-cellsize
  extent_vec[4] <- extent_vec[3]+(n_y*cellsize)-cellsize
  
  coords <- st_coordinates(points)
  matrix <- kde2d(coords[,1],coords[,2],h = bandwith,n = c(n_x,n_y),lims = extent_vec)
  raster(matrix)
}



choose_bw <- function(dataset) {
  X <- coordinates(dataset)
  sigma <- c(sd(X[,1]),sd(X[,2]))  * (2 / (3 * nrow(X))) ^ (1/6)
  return(sigma/1000)
}


# Note that the second argument is the cellsize and the 
# third argument is the bandwidth
kde_1 <- st_kde(arlington_july_sept, 0.0005, 0.01)
kde_2 <- st_kde(arlington_july_sept, 0.0005, 0.001)
kde_3 <- st_kde(arlington_july_sept, 0.00005, 0.01)


kde_filtered <- kde_3
kde_filtered[kde_filtered < 500] <- NA
summary(kde_filtered)
##                layer
## Min.        500.0002
## 1st Qu.     547.5011
## Median      644.0310
## 3rd Qu.     807.8765
## Max.       1730.0531
## NA's    5173975.0000
# Note that the order of the layers matters... put rasters at top
# and points at the very bottom!
tmap_mode("view")

tm_shape(kde_filtered, raster.downsample = FALSE) +
tm_raster("layer", title = "Kernel Density of Crashes", drop.levels=FALSE,
          breaks = c(500, 100, 500, 750, 1000, 2000), interval.closure="right") +
  tm_shape(arlington_july_sept) +
  tm_dots(size = 0.05, col = "red") +
  tm_basemap("OpenStreetMap") + 
  tm_shape(ac_bound_proj) +
  tm_borders(col = "blue") 


We can use the WEATHER_CONDITION attribute to visualize where weather-affected crashes occur most frequently—again this example dataset includes all crashes between July and September 2021.


arlington_july_sept %>%
  ggplot() +
  geom_bar(aes(x=WEATHER_CONDITION, fill=WEATHER_CONDITION)) +
  scale_fill_manual(values = c("red","blue","green"), name="Weather Conditions") +
  theme_minimal() +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        axis.title.y=element_blank())

janitor::tabyl(arlington_july_sept$WEATHER_CONDITION)
arlington_july_sept_weather <- arlington_july_sept %>%
   filter(WEATHER_CONDITION != "1. No Adverse Condition (Clear/Cloudy)") 

arlington_july_sept_weather %>%
  st_coordinates() %>%
  as_tibble() %>%
  ggplot() +
  stat_density_2d(aes(X, Y, fill = stat(level)),  geom = "polygon", alpha = 0.2,
                  h = 0.01, n = 300) + 
  coord_equal() + 
  geom_sf(data = arlington_july_sept_weather, colour = "red", fill = NA,
          inherit.aes = FALSE) +
  xlab('Longitude') + 
  ylab('Latitude') +
  scale_fill_viridis_c() + 
  geom_sf(data = ac_bound_proj, colour = "grey70", fill = NA, inherit.aes = FALSE) +
  theme_map() + theme(legend.position = "none")


Sometimes it is easier to understand what is going on with a dynamic map…


library(tmap)
library(tmaptools)

st_kde <- function(points, cellsize, bandwith, extent = NULL){
  require(MASS)
  require(raster)
  require(sf)
  if(is.null(extent)){
    extent_vec <- st_bbox(points)[c(1,3,2,4)]
  } else{
    extent_vec <- st_bbox(extent)[c(1,3,2,4)]
  }
  
  n_y <- ceiling((extent_vec[4]-extent_vec[3])/cellsize)
  n_x <- ceiling((extent_vec[2]-extent_vec[1])/cellsize)
  
  extent_vec[2] <- extent_vec[1]+(n_x*cellsize)-cellsize
  extent_vec[4] <- extent_vec[3]+(n_y*cellsize)-cellsize
  
  coords <- st_coordinates(points)
  matrix <- kde2d(coords[,1],coords[,2],h = bandwith,n = c(n_x,n_y),lims = extent_vec)
  raster(matrix)
}


# Note that the second argument is the cellsize and the 
# third argument is the bandwidth
kde_1 <- st_kde(arlington_july_sept_weather, 0.0005, 0.01)
kde_2 <- st_kde(arlington_july_sept_weather, 0.0005, 0.001)
kde_3 <- st_kde(arlington_july_sept_weather, 0.00005, 0.01)


kde_filtered <- kde_3
kde_filtered[kde_filtered < 500] <- NA
summary(kde_filtered)
##                layer
## Min.        500.0017
## 1st Qu.     607.2590
## Median      787.0010
## 3rd Qu.    1127.3476
## Max.       2213.9569
## NA's    4360891.0000
# Note that the order of the layers matters... put rasters at top
# and points at the very bottom!
tmap_mode("view")

tm_shape(kde_filtered, raster.downsample = FALSE) +
tm_raster("layer", title = "Kernel Density of Weather-Affected Crashes", drop.levels=FALSE,
          breaks = c(500, 100, 500, 750, 1000, 2000, 2500), interval.closure="right") +
  tm_shape(arlington_july_sept_weather) +
  tm_dots(size = 0.05, col = "red") +
  tm_basemap("OpenStreetMap") + 
  tm_shape(ac_bound_proj) +
  tm_borders(col = "blue")